#############################################################################
# Inference in GFE and GFE pp
# - Oracle property (taking groups as given)
# - Errors clustered at the state level
#
#############################################################################

rm(list=ls())
graphics.off()

library(data.table)
library(fixest)
library(xtable)
library(Hmisc)
library("RColorBrewer")
library(ggplot2)
library(ggpattern)

memory.limit(size=200000)

rm(list=ls())
timeb0 = Sys.time()

#############################################################################
# GFE results
#############################################################################

load("Data_new/GFE/GFE_results.RData")
dc = dc_g5
K=5

# Add groups to the data
dt_bounds = dt[,.SD,.SDcols=c("state","fips","year","corn_area","yield","ddr","precr")]
dim(dt_bounds)
dt_bounds = merge(dt_bounds,dc[,.(fips,year,group)],by=c("fips","year"),all=TRUE)
dim(dt_bounds)
summary(dt_bounds$group)

# Re-estimate the model given the groups
form = as.formula("yield ~ precr + ddr:factor(group) | factor(fips)  + factor(year)^factor(state)")
form
gfe = feols(fml=form,data=dt_bounds,cluster="state")

# Save lower and upper bound of estimated coefficients for the slopes
slopes = gfe$coeftable[paste0("ddr:factor(group)",1:K),1:2]
slopes = data.table(slopes)
setnames(slopes,c("coef_gfe","se_gfe"))
slopes[,lb_gfe:=coef_gfe-qnorm(0.975)*se_gfe]
slopes[,ub_gfe:=coef_gfe+qnorm(0.975)*se_gfe]
slopes[,group:=.I]
slopes

# Check they coefficient estimates are the same
table(dc[,.(group,mg)])

# Merge back to the data 
dim(dt_bounds)
dt_bounds = merge(dt_bounds,slopes,by="group",all=TRUE)
dim(dt_bounds)
summary(dt_bounds)

rm(list=setdiff(ls(),list("dt_bounds","timeb0")))


#############################################################################
# GFE per period results
#############################################################################

load("Data_new/GFE_period/GFE_periods_results.RData")
dc = dc_G4
K=4

# Add groups to the data
dim(dt_bounds)
dt_bounds = merge(dt_bounds,dc[,.(fips,year,group_new)],by=c("fips","year"),all=TRUE)
dim(dt_bounds)
setnames(dt_bounds,"group_new","group_pp")
summary(dt_bounds$group_pp)

# Generate period variable
dt_bounds[year>=1950 & year<=1968, period:=1]
dt_bounds[year>=1969 & year<=1987, period:=2]
dt_bounds[year>=1988 & year<=2005, period:=3]
summary(dt_bounds$period)

# Generate interactions of period with ddr
dt_bounds[,ddr_p1:=ddr*(period==1)]
dt_bounds[,ddr_p2:=ddr*(period==2)]
dt_bounds[,ddr_p3:=ddr*(period==3)]
summary(dt_bounds[,.(ddr_p1,ddr_p2,ddr_p3)])

# Re-estimate the model given the groups
form = as.formula("yield ~  precr + ddr_p1:factor(group_pp) + ddr_p2:factor(group_pp) + ddr_p3:factor(group_pp) | 
                  factor(fips)^factor(period) + factor(year)^factor(state)")
gfe_pp = feols(fml=form,data=dt_bounds,cluster="state")

# Check we get the same coefficients
summary(gfe_pp)
table(dc_G4[,.(period,mg)])

# Save lower and upper bound of estimated coefficients for the slopes
nn =c(paste0("ddr_p1:factor(group_pp)",1:K),paste0(paste0("factor(group_pp)",1:K),":ddr_p2"),
      paste0(paste0("factor(group_pp)",1:K),":ddr_p3"))
nn
group =rep(1:4,3)
period = sort(rep(1:3,4))
slopes = gfe_pp$coeftable[nn,1:2]
slopes = cbind(slopes,group,period)
slopes = data.table(slopes)
setnames(slopes,c("coef_gfe_pp","se_gfe_pp","group_pp","period"))
slopes[,lb_gfe_pp:=coef_gfe_pp-qnorm(0.975)*se_gfe_pp]
slopes[,ub_gfe_pp:=coef_gfe_pp+qnorm(0.975)*se_gfe_pp]
slopes

# Merge back to the data 
dim(dt_bounds)
dt_bounds = merge(dt_bounds,slopes,by=c("group_pp","period"),all=TRUE)
dim(dt_bounds)
summary(dt_bounds)

rm(list=setdiff(ls(),list("dt_bounds","timeb0")))

save.image("Data_new/Data_for_ci.RData")

#############################################################################
# Maps
#############################################################################

rm(list=ls())
load("Data_new/Data_for_ci.RData")

# Average across fips
name_eff = c("coef_gfe","lb_gfe","ub_gfe")
dt_plot = dt_bounds[,lapply(.SD,wtd.mean,weights=corn_area),by="fips",.SDcols=c("corn_area",name_eff)]
dim(dt_plot)
summary(dt_plot)

# Average across fips and periods
name_eff = c("coef_gfe_pp","lb_gfe_pp","ub_gfe_pp")
dt_plotp = dt_bounds[,lapply(.SD,wtd.mean,weights=corn_area),by=c("fips","period"),.SDcols=c("corn_area",name_eff)]
dim(dt_plotp)
summary(dt_plotp)

# Bring US map and code for maps
load("Data_ori/Data_us_map.RData")
source("Code/0. CODE Auxiliary.R")
save_res = "Results"

# Parameters for previous maps for consistency
qq= quantile(dt_plot$coef_gfe,probs=seq(0.05,0.95,0.05))
qq
bmin=-20
bmax=2

### Maps of lower and upper bounds 
# GFE 
plot_map(data=dt_plot[,.(fips,lb_gfe)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfe5_lb",palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plot[,.(fips,coef_gfe)],trunc=1,bmin=bmin,bmax=bmax,palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plot[,.(fips,ub_gfe)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfe5_ub",palette="OrRd",rev=1,nc=9)
# GFE per period
plot_map(data=dt_plotp[period==1,.(fips,lb_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfepp_p1_lb",palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==1,.(fips,coef_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==1,.(fips,ub_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfepp_p1_ub",palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==2,.(fips,lb_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfepp_p2_lb",palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==2,.(fips,coef_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==2,.(fips,ub_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfepp_p2_ub",palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==3,.(fips,lb_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfepp_p3_lb",palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==3,.(fips,coef_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,palette="OrRd",rev=1,nc=9)
plot_map(data=dt_plotp[period==3,.(fips,ub_gfe_pp)],trunc=1,bmin=bmin,bmax=bmax,save_name="Map_gfepp_p3_ub",palette="OrRd",rev=1,nc=9)


#############################################################################
# Bootstrap percentiles, mean and variance
# (sample states with replacement weighted by number of observations)
#############################################################################

rm(list=ls())
load("Data_new/Data_for_ci.RData")
source("Code/0. CODE Auxiliary.R")

dt_bounds[,ones:=1]
st = dt_bounds[,lapply(.SD,length),.SDcols="ones",by="state"]
state_list = as.numeric(st$state)
state_obs = as.numeric(st$ones)
ns = length(state_list)
B=1000
sseed=20250102

# Parameters per GFE
K=5
form = as.formula("yield ~ precr + ddr:factor(group) | factor(fips)  + factor(year)^factor(state)")

# Parameters per GFE pp
K=4
form_pp = as.formula("yield ~  precr + ddr_p1:factor(group_pp) + ddr_p2:factor(group_pp) + ddr_p3:factor(group_pp) | 
                  factor(fips)^factor(period) + factor(year)^factor(state)")
nn =c(paste0("ddr_p1:factor(group_pp)",1:K),paste0(paste0("factor(group_pp)",1:K),":ddr_p2"),
      paste0(paste0("factor(group_pp)",1:K),":ddr_p3"))

# To save results
boot_gfe = matrix(1:5,nrow=5,ncol=1)
period = sort(rep(1:3,4))
group =rep(1:4,3)
boot_gfe_pp = cbind(period,group)
data_num = matrix(NA,nrow=4,ncol=B)
rownames(data_num) = c("states id now","states ori sampled","fips id now","fips ori sampled")
res_gfe = matrix(NA,nrow=3,ncol=B)
rownames(res_gfe) = c("nobs","n fips FE","n state-year FE")
res_gfe_pp = matrix(NA,nrow=3,ncol=B)
rownames(res_gfe_pp) = c("nobs","n fips-period FE","n state-year FE")
mg_gfe = matrix(NA,nrow=7,ncol=B)
mg_gfe_pp1 = matrix(NA,nrow=7,ncol=B)
mg_gfe_pp2 = matrix(NA,nrow=7,ncol=B)
mg_gfe_pp3 = matrix(NA,nrow=7,ncol=B)
mg_names =c(paste0("P",c(10,25,50,75,90)),"Mean","Var")
rownames(mg_gfe) = mg_names
rownames(mg_gfe_pp1) = mg_names
rownames(mg_gfe_pp2) = mg_names
rownames(mg_gfe_pp3) = mg_names

for (b in 1:B) {
  
  print(paste0("Sample number ",b))
  set.seed(sseed+b)
  
  ################################################
  ### Data
  ################################################
  
  state_now = sample(state_list,size=length(state_list),prob=state_obs,replace=TRUE)
  state_now = sort(state_now)
  dt_now = dt_bounds[state==state_now[1]] 
  dt_now[,state_new:=1]    
  for (k in 2:ns) {    
    dt_k = dt_bounds[state==state_now[k]] 
    dt_k[,state_new:=k]
    dt_now = rbind(dt_now,dt_k)
  }
  dt_now[,fips_new := state_new*100000+fips]
  data_num[1,b]=length(unique(dt_now$state_new))
  data_num[2,b]=length(unique(dt_now$state))
  data_num[3,b]=length(unique(dt_now$fips_new))
  data_num[4,b]=length(unique(dt_now$fips))
  dt_now[,state:=state_new]
  dt_now[,fips:=fips_new]
  
  ################################################
  ### GFE 
  ################################################
  
  gfe = feols(fml=form,data=dt_now)
  slopes = gfe$coeftable[paste0("ddr:factor(group)",1:5),1:2]
  boot_gfe = cbind(boot_gfe,slopes[,1])
  res_gfe[1,b]=gfe$nobs 
  res_gfe[2,b]=length(unique(gfe$fixef_id$`factor(fips)`))
  res_gfe[3,b]=length(unique(gfe$fixef_id$`factor(year)^factor(state)`))
  
  slopes=data.table(slopes)
  slopes[,group:=.I]
  dt_now = merge(dt_now,slopes[,.(group,Estimate)],by="group",all=TRUE)
  
  mg_gfe[1:5,b]=wtd.quantile(dt_now$Estimate,probs=c(0.1,0.25,0.5,0.75,0.9),weights=dt_now$corn_area,na.rm=TRUE)
  mg_gfe[6,b]=wtd.mean(dt_now$Estimate,weights=dt_now$corn_area,na.rm=TRUE)
  mg_gfe[7,b]=wtd.var(dt_now$Estimate,weights=dt_now$corn_area,na.rm=TRUE)
  dt_now[,Estimate:=NULL]
  
  ################################################
  ### GFE pp
  ################################################
  gfe_pp = feols(fml=form_pp,data=dt_now)
  slopes = gfe_pp$coeftable[nn,1:2]
  boot_gfe_pp = cbind(boot_gfe_pp,slopes[,1])
  res_gfe_pp[1,b]=gfe_pp$nobs 
  res_gfe_pp[2,b]=length(unique(gfe_pp$fixef_id$`factor(fips)^factor(period)`))
  res_gfe_pp[3,b]=length(unique(gfe_pp$fixef_id$`factor(year)^factor(state)`)) 
  
  slopes = data.table(slopes)
  slopes[,period:=period]
  slopes[,group:=group]
  dt_now = merge(dt_now,slopes[,.(group,period,Estimate)],by=c("group","period"),all=TRUE)
  
  mg_gfe_pp1[1:5,b]=wtd.quantile(dt_now[period==1,Estimate],probs=c(0.1,0.25,0.5,0.75,0.9),na.rm=TRUE,weights=dt_now[period==1,corn_area])
  mg_gfe_pp1[6,b]=wtd.mean(dt_now[period==1,Estimate],na.rm=TRUE,weights=dt_now[period==1,corn_area])
  mg_gfe_pp1[7,b]=wtd.var(dt_now[period==1,Estimate],na.rm=TRUE,weights=dt_now[period==1,corn_area])
  mg_gfe_pp2[1:5,b]=wtd.quantile(dt_now[period==2,Estimate],probs=c(0.1,0.25,0.5,0.75,0.9),na.rm=TRUE,weights=dt_now[period==2,corn_area])
  mg_gfe_pp2[6,b]=wtd.mean(dt_now[period==2,Estimate],na.rm=TRUE,weights=dt_now[period==2,corn_area])
  mg_gfe_pp2[7,b]=wtd.var(dt_now[period==2,Estimate],na.rm=TRUE,weights=dt_now[period==2,corn_area])
  mg_gfe_pp3[1:5,b]=wtd.quantile(dt_now[period==3,Estimate],probs=c(0.1,0.25,0.5,0.75,0.9),na.rm=TRUE,weights=dt_now[period==3,corn_area])
  mg_gfe_pp3[6,b]=wtd.mean(dt_now[period==3,Estimate],na.rm=TRUE,weights=dt_now[period==3,corn_area])
  mg_gfe_pp3[7,b]=wtd.var(dt_now[period==3,Estimate],na.rm=TRUE,weights=dt_now[period==3,corn_area])
  dt_now[,Estimate:=NULL]

}

save.image("Data_new/Data_for_ci.RData")

boot_gfe[,1:10]
boot_gfe_pp[,1:10]
data_num[,1:10]
res_gfe[,1:10]
res_gfe_pp[,1:10]
mg_gfe[,1:10]
mg_gfe_pp1[,1:10]
mg_gfe_pp2[,1:10]
mg_gfe_pp3[,1:10]

################################################
#### Some basic checks
################################################

### data_num
# Number of "new" states =31
summary(data_num[1,])
# Number of originally sampled states usually lower
summary(data_num[2,])
# Number of originally sampled fips usually lower than that "new fips"
table(data_num[3,]>data_num[4,])

### res_gfe
# number of fips FE
table(res_gfe[2,]==data_num[3,])
# number of state-year FE < number states x 56
table(res_gfe[3,]<=(data_num[1,]*56))

### res_gfe_pp 
# number of observations <= res_gfe
table(res_gfe_pp[1,]<=res_gfe[1,])
summary(res_gfe_pp[1,])
summary(res_gfe[1,])

# number of fips-period <= number of fips in GFE x3
table(res_gfe_pp[2,]<=(res_gfe[2,]*3))
summary(res_gfe_pp[2,])
summary(res_gfe[2,]*3)

timeb1 = Sys.time()
timeb1-timeb0

#############################################################################
# Tabulate
#############################################################################

rm(list=ls())
load("Data_new/Data_for_ci.RData")
confidence =0.95
probs = c((1-confidence)/2,1-(1-confidence)/2)
probs

### GFE 
# Main estimates
s=wtd.quantile(dt_bounds$coef_gfe,probs=c(0.1,0.25,0.5,0.75,0.9),weights=dt_bounds$corn_area,na.rm=TRUE)
s=c(s,wtd.mean(dt_bounds$coef_gfe,weights=dt_bounds$corn_area,na.rm=TRUE))
s=c(s,wtd.var(dt_bounds$coef_gfe,weights=dt_bounds$corn_area,na.rm=TRUE))
# CI 
s2 = apply(mg_gfe,1,quantile,probs=probs)
s2
# SE
s3 = round(apply(mg_gfe,1,sd),digits=3)
s3 = paste0("(",s3,")")
s3
# 
# Put results in data.table
nn = c(paste0("Percentile ",c(10,25,50,75,90)),"Mean","Variance")
nnames = c(rbind(nn,rep("",dim(s2)[2])))
nvalues = c(rbind(sprintf("%.3f",s),s3))
df = data.table(Est=nnames,Val=nvalues)

### GFE per period
# Main estimates
sq = as.matrix(dt_bounds[,lapply(.SD,wtd.quantile,probs=c(0.1,0.25,0.5,0.75,0.9),weights=corn_area,na.rm=TRUE),.SDcols="coef_gfe_pp", by="period"],ncol=2)
sm = as.matrix(dt_bounds[,lapply(.SD,wtd.mean,weights=corn_area,na.rm=TRUE),.SDcols="coef_gfe_pp", by="period"],ncol=2)
sv = as.matrix(dt_bounds[,lapply(.SD,wtd.var,weights=corn_area,na.rm=TRUE),.SDcols="coef_gfe_pp", by="period"],ncol=2)
# SE
s3_p1 = round(apply(mg_gfe_pp1,1,sd),digits=3)
s3_p2 = round(apply(mg_gfe_pp2,1,sd),digits=3)
s3_p3 = round(apply(mg_gfe_pp3,1,sd),digits=3)
s3_p1 = paste0("(",s3_p1,")")
s3_p2 = paste0("(",s3_p2,")")
s3_p3 = paste0("(",s3_p3,")")
cbind(s3_p1,s3_p2,s3_p3)
# Put results in data.table
nvalues_p1 =  c(rbind(sprintf("%.3f",c(sq[1:5,2],sm[1,2],sv[1,2])),s3_p1))
nvalues_p2 =  c(rbind(sprintf("%.3f",c(sq[6:10,2],sm[2,2],sv[2,2])),s3_p2))
nvalues_p3 =  c(rbind(sprintf("%.3f",c(sq[11:15,2],sm[3,2],sv[3,2])),s3_p3))
df = data.table(Est=nnames,gfe=nvalues,gfe_p1=nvalues_p1,gfe_p2=nvalues_p2,gfe_p3=nvalues_p3)
df

# Save in latex
l=dim(df)[2]
print(xtable(df,type="latex",display=c("s",rep("s",l))),hline.after = NULL,
        file="Results/Table_errors_GFE_GFEpp.tex",include.rownames = FALSE,include.colnames=FALSE,
        sanitize.text.function = function(x){x},only.contents = TRUE)


